Explore ML and DS survey Ukraine 2021

 · 461 min read
 · Anastasiia Selezen
Table of contents
In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
In [2]:
# import plotly
# plotly.offline.init_notebook_mode() 
In [3]:
import plotly.io as pio
# Make 'notebook' plotly's default renderer, so later fig.show() calls that
# pass no renderer argument use it.
pio.renderers.default = 'notebook' 

Import libraries

In [4]:
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides the pandas warnings triggered by
# the in-place column edits on `data_ukraine` in later cells — consider
# narrowing it to specific categories.
warnings.filterwarnings("ignore")

# Widen the pandas display limits so wide survey tables and long question
# texts are shown without truncation.
for option_name, option_value in (
    ("display.max_columns", 150),
    ('display.max_rows', 999),
    ('display.max_colwidth', None),
):
    pd.set_option(option_name, option_value)

Introduction

This notebook explores data from the Kaggle Machine Learning and Data Science Survey competition!

I am a data scientist from Ukraine, so I'd like to explore the subgroup of professionals from my country.

Load data

In [5]:
# Locate the survey CSV without editing the notebook for each environment:
#   1. the KAGGLE_SURVEY_CSV environment variable, when set;
#   2. the Kaggle input directory, when the file exists there;
#   3. the original local path, as a last resort.
_KAGGLE_CSV = '../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv'
_LOCAL_CSV = '/Users/anastasiiaselezen/study/blog/content/kaggle_survey_2021_responses.csv'
survey_csv_path = os.environ.get('KAGGLE_SURVEY_CSV') or (
    _KAGGLE_CSV if os.path.exists(_KAGGLE_CSV) else _LOCAL_CSV
)

# Every column is read as a plain object (string) column.
data = pd.read_csv(survey_csv_path, dtype=object, low_memory=False)

# All rows except the first one; `data` itself keeps that first row.
data_all = data.iloc[1:,:]
In [6]:
# Number of distinct (non-null) Q3 values among the responses, printed in bold.
n_countries = data_all['Q3'].nunique()
print(f"There are \033[1m{n_countries} countries\033[0m took part in this survey")
There are 66 countries took part in this survey
In [7]:
# Treemap over the Q3 (country) column, coloured by country.
treemap_layout = dict(
    margin=dict(t=60, l=15, r=15, b=15),
    title_text="<b>Countries Distribution</b>",
    title_x=0.5,
    font=dict(family="serif", size=20, color='#000000'),
)

fig = px.treemap(data_all, path=['Q3'], color='Q3')
fig.update_layout(**treemap_layout)
fig.show()
In [8]:
# Rows whose country answer (Q3) is Ukraine, plus the first row of the raw
# table kept separately as `questions`.
data_ukraine = data.loc[data['Q3'].eq('Ukraine')]
questions = data.iloc[0]
print("Ukrainian Respondents:", len(data_ukraine))
Ukrainian Respondents: 186
In [9]:
# Shorten a few long country names to keep the plot ticks manageable.
# The result is assigned back instead of using a chained `inplace=True` call,
# which is not guaranteed to update the frame under pandas Copy-on-Write.
data['Q3'] = data['Q3'].replace({
    'United States of America': 'USA',
    'United Kingdom of Great Britain and Northern Ireland': 'UK',
    'Iran, Islamic Republic of...': 'Iran'
})

# Number of respondents per country in 2021, largest first
country_cnt_2021 = data['Q3'].value_counts()

# Only the largest countries are plotted. The highlighted bar is located by
# name, so the highlight stays on Ukraine even if its rank changes.
TOP_N_COUNTRIES = 31
HIGHLIGHT_COUNTRY = 'Ukraine'
BASE_COLOR, HIGHLIGHT_COLOR = '#0077b6', '#EEEF20'

top_countries = country_cnt_2021.iloc[:TOP_N_COUNTRIES]
is_highlighted = [country == HIGHLIGHT_COUNTRY for country in top_countries.index]

# Plot figure object
fig, ax = plt.subplots(1, 1, figsize=(20, 10), dpi=200)

# Bar plot
bar_colors = [HIGHLIGHT_COLOR if flag else BASE_COLOR for flag in is_highlighted]
ax.bar(top_countries.index,
       top_countries.values,
       color=bar_colors,
       edgecolor='white',
       width=0.9);

# Set title
ax.set_title('Number of Respondents per Country',
             fontsize=20, fontfamily='serif', fontweight='bold', color='black')

# Axes xticks rotation
ax.set_xticklabels(labels=top_countries.index, rotation=65, fontsize=17)

# Remove yticks
ax.tick_params(left=False, bottom=False)
ax.set_yticklabels([])

# Remove spines
for side in ('top', 'right', 'left'):
    ax.spines[side].set_visible(False)

# Annotate every bar with its count; the highlighted bar gets a larger label
for bar, flag in zip(ax.patches, is_highlighted):
    ax.annotate(text=f'{bar.get_height():.0f}',
                xy=(bar.get_x() + bar.get_width() / 2, bar.get_height() + 300),
                ha='center',
                va='center',
                rotation=50,
                size=25 if flag else 17)

In the chart above you can see that there are 186 records in the original data that we are interested in. Let's dive into it! :)

In [10]:
# Select the responses from Ukraine. The explicit copy makes `data_ukraine` an
# independent frame: later cells relabel some of its columns, and doing that on
# a filtered slice of `data` triggers pandas chained-assignment warnings.
data_ukraine = data[data['Q3']=='Ukraine'].copy()
# First row of the raw table, kept separately.
questions = data.iloc[0, :]

Exploring gender distribution

In [11]:
# Respondents per Q2 (gender) answer within the Ukraine subset
data_ukraine.loc[:, 'Q2'].value_counts()
Out[11]:
Man      156
Woman     30
Name: Q2, dtype: int64
In [12]:
# Count respondents per gender, relabelled Man/Woman -> Male/Female and ordered
# alphabetically. `rename_axis(...).reset_index(name=...)` yields the columns
# ['Gender', 'Count'] on both pandas 1.x and 2.x, unlike renaming the 'index'
# and 'Q2' columns, whose names changed in pandas 2.0.
gender = (
    data_ukraine['Q2']
    .replace({'Man': 'Male', 'Woman': 'Female'})
    .value_counts()
    .sort_index()
    .rename_axis('Gender')
    .reset_index(name='Count')
)

# Bind each colour to its label explicitly instead of relying on row order.
gender_colors = {'Female': '#ffff3f', 'Male': '#0077b6'}
slice_colors = [gender_colors.get(label, '#adb5bd') for label in gender['Gender']]

fig = go.Figure(data=[go.Pie(labels=gender['Gender'],
                             values=gender['Count'])])

fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=slice_colors, line=dict(color='white', width=5)))

fig.update_layout(showlegend=False,
                  title_text="<b>Gender Distribution</b>",
                  title_x=0.5,
                  font=dict(family='serif', size=20, color='#000000'))

fig.show()

It is no surprise that there are many more men in the industry; according to the data, women make up 16.1% of all responses.

Exploring educational level of the respondents

In [13]:
# Shorten the two longest education answers so they fit on the axis.
# The relabelled column is written back with `assign` rather than a chained
# `inplace=True` call on a column of a filtered frame, which pandas does not
# guarantee to propagate and which raises chained-assignment warnings.
data_ukraine = data_ukraine.assign(
    Q4=data_ukraine['Q4'].replace({
        'Some college/university study without earning a bachelor’s degree': 'No bachelor’s degree',
        'No formal education past high school': 'High school'
    })
)

# Respondents per education level. This chain gives the columns
# ['Education', 'Count'] on both pandas 1.x and 2.x.
education = (
    data_ukraine['Q4']
    .value_counts()
    .rename_axis('Education')
    .reset_index(name='Count')
)

# Share of each level as a text label, e.g. '50.0%'
education['percent'] = (education['Count'] / education['Count'].sum() * 100).round(2).astype(str) + '%'
colors = ['#014f86', '#2a6f97', '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']


fig = go.Figure(go.Bar(
            x=education['Count'],
            y=education['Education'],
            text=education['percent'],
            orientation='h',
            marker_color=colors
                        ))

fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Education Distribution</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

As you can see, half of the respondents have a Master's degree, and only 2.15% stopped at high school. So, education really matters!

In [14]:
# Education level by gender: one bar panel for men, one for women.
fig = make_subplots(rows=1, cols=2, subplot_titles=("<b>Men</b>", "<b>Women</b>"), shared_yaxes=True)

colors_education_man = ['#014f86', '#2a6f97', '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']
colors_education_woman = ['#FFA200', '#FFAA00', '#FFB700','#FFC300', '#FFD000', '#FFDD00', '#FFEA00']

# Build both panels with the same code path instead of two copy-pasted blocks.
education_by_gender = {}
panels = [('Man', colors_education_man), ('Woman', colors_education_woman)]
for panel_col, (gender_label, palette) in enumerate(panels, start=1):
    # This chain gives the columns ['Education', 'Count'] on both pandas 1.x
    # and 2.x.
    counts = (
        data_ukraine.loc[data_ukraine['Q2'] == gender_label, 'Q4']
        .value_counts()
        .rename_axis('Education')
        .reset_index(name='Count')
    )
    # Share within this gender, as a text label
    counts['percent'] = (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%'
    education_by_gender[gender_label] = counts

    fig.add_trace(
        go.Bar(
            x=counts['Education'],
            y=counts['Count'],
            text=counts['percent'],
            marker_color=palette),
        row=1, col=panel_col)

# Keep the per-gender tables available under their original names.
education_man = education_by_gender['Man']
education_woman = education_by_gender['Woman']

# One call styles both traces.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Education Distribution</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

The situation is the same from a gender perspective: most respondents have at least an MS or BS degree. However, a Doctoral degree is more popular among men.

Exploring age distribution

In [15]:
# Age group (Q1) by gender: one bar panel for men, one for women.
fig = make_subplots(rows=1, cols=2, subplot_titles=("<b>Men</b>", "<b>Women</b>"), shared_yaxes=True)

colors_man = ['#012a4a','#013a63','#01497c','#014f86', '#2a6f97', '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']
colors_woman = ['#FF7B00','#FF8800','#FF9500','#FFA200', '#FFAA00', '#FFB700','#FFC300', '#FFD000', '#FFDD00', '#FFEA00']

# Build both panels with the same code path instead of two copy-pasted blocks.
age_by_gender = {}
panels = [('Man', colors_man), ('Woman', colors_woman)]
for panel_col, (gender_label, palette) in enumerate(panels, start=1):
    # This chain gives the columns ['Age', 'Count'] on both pandas 1.x and 2.x.
    counts = (
        data_ukraine.loc[data_ukraine['Q2'] == gender_label, 'Q1']
        .value_counts()
        .rename_axis('Age')
        .reset_index(name='Count')
    )
    # Share within this gender, as a text label
    counts['percent'] = (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%'
    age_by_gender[gender_label] = counts

    fig.add_trace(
        go.Bar(
            x=counts['Age'],
            y=counts['Count'],
            text=counts['percent'],
            marker_color=palette),
        row=1, col=panel_col)

# Keep the per-gender tables available under their original names.
age_man = age_by_gender['Man']
age_woman = age_by_gender['Woman']

# One call styles both traces.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Age Distribution</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Age is pretty much the same for all genders. Most respondents are between 18 and 40 years old, but there are more men than women aged 40+.

Current role

In [16]:
# Respondents per current role (Q5). This chain gives the columns
# ['Current role', 'Count'] on both pandas 1.x and 2.x, unlike renaming the
# 'index' and 'Q5' columns, whose names changed in pandas 2.0.
current_role = (
    data_ukraine['Q5']
    .value_counts()
    .rename_axis('Current role')
    .reset_index(name='Count')
)

# Share of each role as a text label
current_role['percent'] = (current_role['Count'] / current_role['Count'].sum() * 100).round(2).astype(str) + '%'
colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']


fig = go.Figure(go.Bar(
            x=current_role['Count'],
            y=current_role['Current role'],
            text=current_role['percent'],
            orientation='h',
            marker_color=colors
                        ))

fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Current role</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Most Ukrainian respondents work as either a Data Scientist, Student or Software Engineer. The role of Machine Learning Engineer is held by only 14 people (7.53%).

Current role by genders

On the chart below you can click different parts of it to explore allocation of gender, education level and current role.

For me, the interesting insight is that women with a BS degree are either unemployed or work as Data Engineers, whereas men with a BS degree additionally take positions as Data Scientists or Software Engineers.

Men and women with Doctoral degree mostly take roles as Research Scientists and Data Scientists.

Also, according to the data High school is enough for Data Analyst role in Ukraine.

In [17]:
# Sunburst drilling down gender (Q2) -> education (Q4) -> current role (Q5),
# coloured by gender.
gender_palette = {'Man':'#0077b6', 'Woman':'#ffff3f'}
sunburst_layout = dict(
    showlegend=False,
    plot_bgcolor='white',
    margin=dict(pad=20),
    xaxis={'showticklabels': True},
    yaxis_title=None,
    xaxis_title=None,
    yaxis={'categoryorder':'total ascending'},
    title_text="<b>Distribution of roles by gender and education</b>",
    title_x=0.5,
    font=dict(family="serif", size=17, color='#000000'),
    title_font_size=30,
)

fig = px.sunburst(
    data_ukraine,
    path=['Q2', 'Q4', 'Q5'],
    color='Q2',
    color_discrete_map=gender_palette,
)
fig.update_layout(**sunburst_layout)
fig.show()

Years of programming experience

In [18]:
# Years of programming experience (Q6).
# NOTE(review): the first trace counts ALL Ukrainian respondents, while the
# second counts women only, and the text under the chart reads the first trace
# as men — confirm whether a `Q2 == 'Man'` filter was intended here.
# These chains give the columns ['Experience', 'Count'] on both pandas 1.x and
# 2.x.
experience = (
    data_ukraine['Q6']
    .value_counts()
    .rename_axis('Experience')
    .reset_index(name='Count')
)
experience['percent'] = (experience['Count'] / experience['Count'].sum() * 100).round(2).astype(str) + '%'
colors_experience = ['#014f86', '#2a6f97', '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

experience_woman = (
    data_ukraine.loc[data_ukraine['Q2'] == 'Woman', 'Q6']
    .value_counts()
    .rename_axis('Experience')
    .reset_index(name='Count')
)
experience_woman['percent'] = (experience_woman['Count'] / experience_woman['Count'].sum() * 100).round(2).astype(str) + '%'
colors_woman = ['#FF7B00','#FF8800','#FF9500','#FFA200', '#FFAA00', '#FFB700','#FFC300', '#FFD000', '#FFDD00', '#FFEA00']

# Both traces share one figure.
fig = go.Figure()
for counts, palette in ((experience, colors_experience), (experience_woman, colors_woman)):
    fig.add_trace(
        go.Bar(
            x=counts['Experience'],
            y=counts['Count'],
            text=counts['percent'],
            marker_color=palette))

# One call styles both traces.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Experience Distribution</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

From the chart above it is clear that, on average, respondents have 2-5 years of general programming experience. Only 15 men and 1 woman have 10 to 20 years of experience.

Most popular language

In [19]:
# Summary of the columns at positions 7-19 (the Q7 language answers)
q7_columns = slice(7, 20)
data_ukraine.iloc[:, q7_columns].describe()
Out[19]:
Q7_Part_1 Q7_Part_2 Q7_Part_3 Q7_Part_4 Q7_Part_5 Q7_Part_6 Q7_Part_7 Q7_Part_8 Q7_Part_9 Q7_Part_10 Q7_Part_11 Q7_Part_12 Q7_OTHER
count 157 14 81 13 33 13 25 1 2 26 11 3 27
unique 1 1 1 1 1 1 1 1 1 1 1 1 1
top Python R SQL C C++ Java Javascript Julia Swift Bash MATLAB None Other
freq 157 14 81 13 33 13 25 1 2 26 11 3 27
In [20]:
# Readable label for every Q7 answer column
language_labels = {
    'Q7_Part_1': 'Python',
    'Q7_Part_2': 'R',
    'Q7_Part_3': 'SQL',
    'Q7_Part_4': 'C',
    'Q7_Part_5': 'C++',
    'Q7_Part_6': 'Java',
    'Q7_Part_7': 'Javascript',
    'Q7_Part_8': 'Julia',
    'Q7_Part_9': 'Swift',
    'Q7_Part_10': 'Bash',
    'Q7_Part_11': 'MATLAB',
    'Q7_Part_12': 'None',
    'Q7_OTHER': 'Other'
}

# Number of non-null answers per Q7 column. The columns are picked by name
# rather than by position, and `count()` returns integer counts directly
# instead of going through `describe()`.
data_ukraine_q7 = (
    data_ukraine.filter(regex=r'^Q7_')
    .count()
    .rename_axis('Languages')
    .reset_index(name='Count')
)
# Assigned back instead of a chained `inplace=True` replace on the column.
data_ukraine_q7['Languages'] = data_ukraine_q7['Languages'].replace(language_labels)

fig = go.Figure(data=[go.Pie(labels=data_ukraine_q7['Languages'],
                             values=data_ukraine_q7['Count'])])

colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']

fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=2)))

fig.update_layout(showlegend=False,
                  title_text="<b>Languages Distribution</b>",
                  title_x=0.5,
                  font=dict(family='serif', size=20, color='#000000'))

fig.show()

As expected, the most popular programming language is Python! Interestingly, R is less popular than Javascript in the Data Science community in Ukraine.

Most recommended language

In [21]:
# Respondents per recommended language (Q8). This chain gives the columns
# ['Languages', 'Count'] on both pandas 1.x and 2.x, unlike renaming the
# 'index' and 'Q8' columns, whose names changed in pandas 2.0.
language_recomended = (
    data_ukraine['Q8']
    .value_counts()
    .rename_axis('Languages')
    .reset_index(name='Count')
)

fig = go.Figure(data=[go.Pie(labels=language_recomended['Languages'],
                             values=language_recomended['Count'])])

colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']

fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=2)))

# The chart title is intentionally left disabled, as in the original cell:
#                   title_text="<b>Recomended Languages Distribution</b>",
fig.update_layout(showlegend=False,
                  title_x=0.5,
                  font=dict(family='serif', size=20, color='#000000'))

fig.show()

Also as expected, the most recommended programming language is Python!

Tools allocation

In [22]:
# Summary of the columns at positions 21-33 (the Q9 tool answers)
q9_columns = slice(21, 34)
data_ukraine.iloc[:, q9_columns].describe()
Out[22]:
Q9_Part_1 Q9_Part_2 Q9_Part_3 Q9_Part_4 Q9_Part_5 Q9_Part_6 Q9_Part_7 Q9_Part_8 Q9_Part_9 Q9_Part_10 Q9_Part_11 Q9_Part_12 Q9_OTHER
count 38 16 32 55 95 13 27 29 16 7 110 3 9
unique 1 1 1 1 1 1 1 1 1 1 1 1 1
top Jupyter (JupyterLab, Jupyter Notebooks, etc) RStudio Visual Studio Visual Studio Code (VSCode) PyCharm Spyder Notepad++ Sublime Text Vim / Emacs MATLAB Jupyter Notebook None Other
freq 38 16 32 55 95 13 27 29 16 7 110 3 9
In [23]:
# Readable label for every Q9 answer column. The stray TAB characters that
# trailed 'Visual Studio' and 'Jupyter Notebook' have been removed.
tool_labels = {
    'Q9_Part_1': 'Jupyter (JupyterLab, Jupyter Notebooks, etc)',
    'Q9_Part_2': 'RStudio',
    'Q9_Part_3': 'Visual Studio',
    'Q9_Part_4': 'Visual Studio Code',
    'Q9_Part_5': 'PyCharm',
    'Q9_Part_6': 'Spyder',
    'Q9_Part_7': 'Notepad++',
    'Q9_Part_8': 'Sublime Text',
    'Q9_Part_9': 'Vim / Emacs',
    'Q9_Part_10': 'MATLAB',
    'Q9_Part_11': 'Jupyter Notebook',
    'Q9_Part_12': 'None',
    'Q9_OTHER': 'Other'
}

# Number of non-null answers per Q9 column, with each tool's share of all
# selections as a text label, sorted from most to least used.
# Columns are picked by name instead of position, and `count()` yields integer
# counts, so the percentage is computed in one vectorised step instead of a
# per-row loop over a hard-coded range(13).
data_ukraine_q9 = (
    data_ukraine.filter(regex=r'^Q9_')
    .count()
    .rename_axis('Tools')
    .reset_index(name='Count')
    .assign(
        Tools=lambda frame: frame['Tools'].replace(tool_labels),
        percent=lambda frame: (frame['Count'] / frame['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)
colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']

fig = go.Figure(
   go.Bar(
            x=data_ukraine_q9['Tools'],
            y=data_ukraine_q9['Count'],
            text=data_ukraine_q9['percent'],
            marker_color=colors))

fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)


fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=40),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Most popular tools</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

I totally agree with the chart above. I use the first three tools in my projects as well!

What type of computing platform do data scientists use most often?

Another clickable chart: use it to explore the allocation of current roles and which computing platform these people use.

You can infer that cloud platforms are used mostly by Data Scientists and Research Scientists rather than other roles.

In [24]:
# Shorten the Q11 answers and give missing answers an explicit label (the
# sunburst path needs a value at every level).
# The cleaned column is written back with `assign` rather than chained
# `inplace=True` calls on a column of a filtered frame, which pandas does not
# guarantee to propagate.
platform_labels = {
    'A laptop': 'Laptop',
    'A personal computer / desktop': 'Desktop',
    'A cloud computing platform (AWS, Azure, GCP, hosted notebooks, etc)': 'Cloud Computing',
    'A deep learning workstation (NVIDIA GTX, LambdaLabs, etc)': 'DL workstation',
    'None': 'Prefer not to answer'
}
data_ukraine = data_ukraine.assign(
    Q11=data_ukraine['Q11'].fillna("None").replace(platform_labels)
)


fig = px.sunburst(data_ukraine, path=['Q5', 'Q11'], color='Q5')
fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Types of computing platform</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Which types of specialized hardware do data scientists in Ukraine use on a regular basis?

In [25]:
# Readable label for every Q12 answer column
hardware_labels = {
    'Q12_Part_1': 'NVIDIA GPUs',
    'Q12_Part_2': 'Google Cloud TPUs',
    'Q12_Part_3': 'AWS Trainium Chips',
    'Q12_Part_4': 'AWS Inferentia Chips',
    'Q12_Part_5': 'None',
    'Q12_OTHER': 'Other'
}

# Number of non-null answers per Q12 column, with each option's share of all
# selections as a text label, sorted from most to least used.
# Columns are picked by name instead of position, and `count()` yields integer
# counts, so the percentage is computed in one vectorised step instead of a
# per-row loop over a hard-coded range(6).
data_ukraine_q12 = (
    data_ukraine.filter(regex=r'^Q12_')
    .count()
    .rename_axis('HW')
    .reset_index(name='Count')
    .assign(
        HW=lambda frame: frame['HW'].replace(hardware_labels),
        percent=lambda frame: (frame['Count'] / frame['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)
colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']

fig = go.Figure(
   go.Bar(
            x=data_ukraine_q12['HW'],
            y=data_ukraine_q12['Count'],
            text=data_ukraine_q12['percent'],
            marker_color=colors))

fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)


fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=40),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Specialized hardware</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

How many times have Ukrainian data scientists used a TPU?

In [26]:
# Respondents per TPU-usage answer (Q13). This chain gives the columns
# ['tpu', 'Count'] on both pandas 1.x and 2.x, unlike renaming the 'index' and
# 'Q13' columns, whose names changed in pandas 2.0.
tpu_usage = (
    data_ukraine['Q13']
    .value_counts()
    .rename_axis('tpu')
    .reset_index(name='Count')
)

fig = go.Figure(data=[go.Pie(labels=tpu_usage['tpu'],
                             values=tpu_usage['Count'])])

colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']

fig.update_traces(hoverinfo='percent+value',
                  textinfo='label',
                  textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=2)))

fig.update_layout(showlegend=False,
                  margin=dict(pad=120),
                  title_text="<b>TPU usage</b>",
                  title_x=0.55,
                  font=dict(family='serif', size=20, color='#000000'))

fig.show()

Sadly, most Ukrainians do not use TPUs or have used them just a few times.

For how many years have data scientists used machine learning methods?

In [27]:
# Give missing Q15 answers an explicit label.
# The cleaned column is written back with `assign` rather than chained
# `inplace=True` calls on a column of a filtered frame, which pandas does not
# guarantee to propagate.
data_ukraine = data_ukraine.assign(
    Q15=data_ukraine['Q15'].fillna("None").replace({'None': 'Prefer not to answer'})
)

# Years of ML experience by gender: one bar panel for men, one for women.
fig = make_subplots(rows=1, cols=2, subplot_titles=("<b>Men</b>", "<b>Women</b>"), shared_yaxes=True)

colors_man = ['#012a4a','#013a63','#01497c','#014f86', '#2a6f97', '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']
colors_woman = ['#FF7B00','#FF8800','#FF9500','#FFA200', '#FFAA00', '#FFB700','#FFC300', '#FFD000', '#FFDD00', '#FFEA00']

# Build both panels with the same code path instead of two copy-pasted blocks.
ml_experience_by_gender = {}
panels = [('Man', colors_man), ('Woman', colors_woman)]
for panel_col, (gender_label, palette) in enumerate(panels, start=1):
    # This chain gives the columns ['Experience', 'Count'] on both pandas 1.x
    # and 2.x.
    counts = (
        data_ukraine.loc[data_ukraine['Q2'] == gender_label, 'Q15']
        .value_counts()
        .rename_axis('Experience')
        .reset_index(name='Count')
    )
    # Share within this gender, as a text label
    counts['percent'] = (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%'
    ml_experience_by_gender[gender_label] = counts

    fig.add_trace(
        go.Bar(
            x=counts['Experience'],
            y=counts['Count'],
            text=counts['percent'],
            marker_color=palette),
        row=1, col=panel_col)

# Keep the per-gender tables available under their original names.
experience_man = ml_experience_by_gender['Man']
experience_woman = ml_experience_by_gender['Woman']

# One call styles both traces.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Experience Distribution</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

It is plain to see that Ukrainians have around 3-4 years of experience in applying Machine Learning algorithms! And girls are pretty much on the same level as boys. Amazing!

On the sunburst chart below you can explore the relation between current role and years of experience in applying ML. Have fun!

In [28]:
# Sunburst drilling down gender (Q2) -> current role (Q5) -> ML experience
# (Q15), coloured by gender.
gender_palette = {'Man':'#0077b6', 'Woman':'#ffff3f'}
sunburst_layout = dict(
    showlegend=False,
    plot_bgcolor='white',
    margin=dict(pad=20),
    xaxis={'showticklabels': True},
    yaxis_title=None,
    xaxis_title=None,
    yaxis={'categoryorder':'total ascending'},
    title_text="<b>Relation between current role and years of experience</b>",
    title_x=0.5,
    font=dict(family="serif", size=17, color='#000000'),
    title_font_size=30,
)

fig = px.sunburst(
    data_ukraine,
    path=['Q2','Q5', 'Q15'],
    color='Q2',
    color_discrete_map=gender_palette,
)
fig.update_layout(**sunburst_layout)
fig.show()

Machine learning frameworks

There are a lot of different frameworks. There is no need to argue which one is better or worse. They are all good at what they do!

I really enjoy doing deep learning using PyTorch lately :)

In [29]:
# Transposed summary of the columns at positions 72-89 (the Q16 framework
# answers), one row per answer column
q16_columns = slice(72, 90)
data_ukraine.iloc[:, q16_columns].describe().transpose()
Out[29]:
count unique top freq
Q16_Part_1 102 1 Scikit-learn 102
Q16_Part_2 63 1 TensorFlow 63
Q16_Part_3 56 1 Keras 56
Q16_Part_4 51 1 PyTorch 51
Q16_Part_5 5 1 Fast.ai 5
Q16_Part_6 0 0 NaN NaN
Q16_Part_7 46 1 Xgboost 46
Q16_Part_8 26 1 LightGBM 26
Q16_Part_9 14 1 CatBoost 14
Q16_Part_10 3 1 Prophet 3
Q16_Part_11 4 1 H2O 3 4
Q16_Part_12 6 1 Caret 6
Q16_Part_13 2 1 Tidymodels 2
Q16_Part_14 2 1 JAX 2
Q16_Part_15 9 1 PyTorch Lightning 9
Q16_Part_16 10 1 Huggingface 10
Q16_Part_17 12 1 None 12
Q16_OTHER 7 1 Other 7
In [30]:
# Readable label for the Q16 answer columns
framework_labels = {
    'Q16_Part_1': 'Scikit-learn',
    'Q16_Part_2': 'TensorFlow',
    'Q16_Part_3': 'Keras',
    'Q16_Part_4': 'PyTorch',
    'Q16_Part_5': 'Fast.ai',
    'Q16_Part_7': 'Xgboost',
    'Q16_Part_8': 'LightGBM',
    'Q16_Part_9': 'CatBoost',
    'Q16_Part_10': 'Prophet',
    'Q16_Part_11': 'H2O 3',
    'Q16_Part_12': 'Caret',
    'Q16_Part_13': 'Tidymodels',
    'Q16_Part_14': 'JAX',
    'Q16_Part_15': 'PyTorch Lightning',
    'Q16_Part_16': 'Huggingface',
    'Q16_Part_17': 'None',
    'Q16_OTHER': 'Other'
}

# Number of non-null answers per Q16 column, with each framework's share of all
# selections as a text label, sorted from most to least used.
# Columns are picked by name instead of position; options nobody selected
# (Q16_Part_6 in this data) are dropped by their zero count instead of by a
# hard-coded row position; the percentage is computed in one vectorised step
# instead of a per-row loop over a hard-coded range(17).
data_ukraine_q16 = (
    data_ukraine.filter(regex=r'^Q16_')
    .count()
    .rename_axis('Libs')
    .reset_index(name='Count')
    .loc[lambda frame: frame['Count'] > 0]
    .assign(
        Libs=lambda frame: frame['Libs'].replace(framework_labels),
        percent=lambda frame: (frame['Count'] / frame['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)


colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

fig = go.Figure(
   go.Bar(
            x=data_ukraine_q16['Libs'],
            y=data_ukraine_q16['Count'],
            text=data_ukraine_q16['percent'],
            marker_color=colors))

fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)


fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=40),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Most popular frameworks</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Popular ML algorithms

In [31]:
# Summary (count / unique / top / freq) of the Q17 algorithm columns, one row per answer option.
data_ukraine.iloc[:,90:102].describe().T
Out[31]:
count unique top freq
Q17_Part_1 92 1 Linear or Logistic Regression 92
Q17_Part_2 76 1 Decision Trees or Random Forests 76
Q17_Part_3 62 1 Gradient Boosting Machines (xgboost, lightgbm, etc) 62
Q17_Part_4 24 1 Bayesian Approaches 24
Q17_Part_5 6 1 Evolutionary Approaches 6
Q17_Part_6 32 1 Dense Neural Networks (MLPs, etc) 32
Q17_Part_7 54 1 Convolutional Neural Networks 54
Q17_Part_8 14 1 Generative Adversarial Networks 14
Q17_Part_9 31 1 Recurrent Neural Networks 31
Q17_Part_10 16 1 Transformer Networks (BERT, gpt-3, etc) 16
Q17_Part_11 14 1 None 14
Q17_OTHER 6 1 Other 6

The same goes for algorithms. They are all good for their stuff! But the most popular in Ukraine are Regression and Decision Trees. CNNs and Transformers also have their positions. It's cool!

In [32]:
# Count how many times each Q17 (ML algorithm) option was selected; each
# option has its own column, so the non-null count per column is the tally.
q17_labels = {
    'Q17_Part_1': 'Linear/Logistic Regression',
    'Q17_Part_2': 'DecisionTrees / RandomForests',
    'Q17_Part_3': 'Gradient Boosting Machines',
    # The original label carried a stray trailing space and tab character.
    'Q17_Part_4': 'Bayesian Approaches',
    'Q17_Part_5': 'Evolutionary Approaches',
    'Q17_Part_6': 'MLPs',
    'Q17_Part_7': 'CNNs',
    'Q17_Part_8': 'GANs',
    'Q17_Part_9': 'RNNs',
    'Q17_Part_10': 'Transformers (BERT, gpt-3)',
    'Q17_Part_11': 'None',
    'Q17_OTHER': 'Other'
}

data_ukraine_q17 = (
    data_ukraine.iloc[:, 90:102]
    .count()
    .rename_axis('algorithms')
    .reset_index(name='Count')
    # Relabel via assignment: an in-place replace on a column selection does
    # not update the frame under pandas Copy-on-Write.
    .assign(algorithms=lambda counts: counts['algorithms'].replace(q17_labels))
)

fig = go.Figure(data=[go.Pie(labels=data_ukraine_q17['algorithms'], 
                             values=data_ukraine_q17['Count'])])

colors = ['#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']

# Slices show their label; percentage and raw value appear on hover.
fig.update_traces(hoverinfo='percent+value', 
                  textinfo='label', 
                  textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=1)))

fig.update_layout(showlegend=False, 
                  title_text="<b>Algorithms Distribution</b>",
                  title_x=0.5,
                  font=dict(family='serif', size=20, color='#000000'))

fig.show()

Most usable cloud computing platforms

In [33]:
# Summary (count / unique / top / freq) of the Q27_A cloud platform columns, one row per answer option.
data_ukraine.iloc[:,129:141].describe().T
Out[33]:
count unique top freq
Q27_A_Part_1 32 1 Amazon Web Services (AWS) 32
Q27_A_Part_2 16 1 Microsoft Azure 16
Q27_A_Part_3 27 1 Google Cloud Platform (GCP) 27
Q27_A_Part_4 0 0 NaN NaN
Q27_A_Part_5 0 0 NaN NaN
Q27_A_Part_6 2 1 SAP Cloud 2
Q27_A_Part_7 1 1 Salesforce Cloud 1
Q27_A_Part_8 0 0 NaN NaN
Q27_A_Part_9 0 0 NaN NaN
Q27_A_Part_10 0 0 NaN NaN
Q27_A_Part_11 18 1 None 18
Q27_A_OTHER 2 1 Other 2
In [34]:
# Count how many times each Q27_A (cloud computing platform) option was
# selected; each option has its own column.
q27_labels = {
    'Q27_A_Part_1': 'AWS',
    'Q27_A_Part_2': 'Microsoft Azure',
    'Q27_A_Part_3': 'GCP',
    'Q27_A_Part_6': 'SAP Cloud',
    'Q27_A_Part_7': 'Salesforce Cloud',
    'Q27_A_Part_11': 'None',
    'Q27_A_OTHER': 'Other'
}

data_ukraine_q27 = (
    data_ukraine.iloc[:, 129:141]
    .count()
    .rename_axis('CC')
    .reset_index(name='Count')
    # Keep only platforms picked at least once, instead of dropping a list of
    # hard-coded row positions.
    .loc[lambda counts: counts['Count'] > 0]
    .assign(
        # Relabel via assignment: an in-place replace on a column selection
        # does not update the frame under pandas Copy-on-Write.
        CC=lambda counts: counts['CC'].replace(q27_labels),
        # Share of all selections (not of respondents), used as the bar label.
        percent=lambda counts: (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)


colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

# Vertical bars, already ordered from most to least selected platform.
fig = go.Figure(
   go.Bar(
            x=data_ukraine_q27['CC'],
            y=data_ukraine_q27['Count'],
            text=data_ukraine_q27['percent'],
            marker_color=colors))

fig.update_traces(texttemplate='%{text}', 
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)


fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='white', 
                  margin=dict(pad=40),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Popular cloud computing platforms</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Not all cloud computing platforms are present in Ukrainian respondents' answers, so, AWS and GCP are the most popular in Ukraine.

Do Ukrainians use any of the following data storage products on a regular basis?

In [35]:
# Summary (count / unique / top / freq) of the Q30_A data storage columns, one column per answer option.
data_ukraine.iloc[:,147:155].describe()
Out[35]:
Q30_A_Part_1 Q30_A_Part_2 Q30_A_Part_3 Q30_A_Part_4 Q30_A_Part_5 Q30_A_Part_6 Q30_A_Part_7 Q30_A_OTHER
count 6 5 21 2 19 3 11 2
unique 1 1 1 1 1 1 1 1
top Microsoft Azure Data Lake Storage Microsoft Azure Disk Storage Amazon Simple Storage Service (S3) Amazon Elastic File System (EFS) Google Cloud Storage (GCS) Google Cloud Filestore No / None Other
freq 6 5 21 2 19 3 11 2
In [36]:
# Count how many times each Q30_A (data storage product) option was selected;
# each option has its own column.
q30a_labels = {
    'Q30_A_Part_1': 'Microsoft Azure Data Lake Storage',
    'Q30_A_Part_2': 'Microsoft Azure Disk Storage',
    'Q30_A_Part_3': 'Amazon Simple Storage Service (S3)',
    'Q30_A_Part_4': 'Amazon Elastic File System (EFS)',
    'Q30_A_Part_5': 'Google Cloud Storage (GCS)',
    'Q30_A_Part_6': 'Google Cloud Filestore',
    'Q30_A_Part_7': 'None',
    'Q30_A_OTHER': 'Other'
}

data_ukraine_q30A = (
    data_ukraine.iloc[:, 147:155]
    .count()
    .rename_axis('Storage')
    .reset_index(name='Count')
    .assign(
        # Relabel via assignment: an in-place replace on a column selection
        # does not update the frame under pandas Copy-on-Write.
        Storage=lambda counts: counts['Storage'].replace(q30a_labels),
        # Vectorised share of all selections; no hard-coded row count needed.
        percent=lambda counts: (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)

colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']


# Horizontal bars; the y axis is re-ordered by total in the layout below.
fig = go.Figure(go.Bar(
            x=data_ukraine_q30A['Count'],
            y=data_ukraine_q30A['Storage'],
            text=data_ukraine_q30A['percent'],
            orientation='h',
            marker_color=colors
                        ))

fig.update_traces(texttemplate='%{text}', 
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)
                  
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='white', 
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Storage products</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

As expected, S3 and Google Cloud Storage are the rock stars! :) It is true. I use it in my work as well!

How about managed machine learning products

In [37]:
# Summary (count / unique / top / freq) of the Q31_A managed ML product columns, one column per answer option.
data_ukraine.iloc[:,155:165].describe()
Out[37]:
Q31_A_Part_1 Q31_A_Part_2 Q31_A_Part_3 Q31_A_Part_4 Q31_A_Part_5 Q31_A_Part_6 Q31_A_Part_7 Q31_A_Part_8 Q31_A_Part_9 Q31_A_OTHER
count 7 3 1 3 4 0 0 0 57 0
unique 1 1 1 1 1 0 0 0 1 0
top Amazon SageMaker Azure Machine Learning Studio Google Cloud Vertex AI DataRobot Databricks NaN NaN NaN No / None NaN
freq 7 3 1 3 4 NaN NaN NaN 57 NaN

The full list of managed machine learning products that were proposed in survey looks as follows:

● Amazon SageMaker
● Azure Machine Learning Studio
● Google Cloud Vertex AI
● DataRobot
● Databricks
● Dataiku
● Alteryx
● Rapidminer
● No / None
● Other

Dataiku, Alteryx and Rapidminer were not selected by any Ukrainian data scientist.

Besides, not a lot of people really use some managed ML products, but still, Amazon SageMaker has some popularity.

In [38]:
# Count how many times each Q31_A (managed ML product) option was selected;
# each option has its own column.
q31a_labels = {
    'Q31_A_Part_1': 'Amazon SageMaker',
    'Q31_A_Part_2': 'Azure Machine Learning Studio',
    'Q31_A_Part_3': 'Google Cloud Vertex AI',
    'Q31_A_Part_4': 'DataRobot',
    'Q31_A_Part_5': 'Databricks',
    'Q31_A_Part_9': 'None'
}

data_ukraine_q31A = (
    data_ukraine.iloc[:, 155:165]
    .count()
    .rename_axis('managed')
    .reset_index(name='Count')
    # Keep only products picked at least once, instead of dropping a list of
    # hard-coded row positions.
    .loc[lambda counts: counts['Count'] > 0]
    .assign(
        # Relabel via assignment: an in-place replace on a column selection
        # does not update the frame under pandas Copy-on-Write.
        managed=lambda counts: counts['managed'].replace(q31a_labels),
        # Share of all selections (not of respondents), used as the bar label.
        percent=lambda counts: (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)

colors = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']


# Horizontal bars; the y axis is re-ordered by total in the layout below.
fig = go.Figure(go.Bar(
            x=data_ukraine_q31A['Count'],
            y=data_ukraine_q31A['managed'],
            text=data_ukraine_q31A['percent'],
            orientation='h',
            marker_color=colors
                        ))

fig.update_traces(texttemplate='%{text}', 
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)
                  
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='white', 
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Managed ML products</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Big data products on a regular basis & most often used

In [39]:
# Summary (count / unique / top / freq) of the Q32_A big data product columns, one column per answer option.
data_ukraine.iloc[:,165:186].describe()
Out[39]:
Q32_A_Part_1 Q32_A_Part_2 Q32_A_Part_3 Q32_A_Part_4 Q32_A_Part_5 Q32_A_Part_6 Q32_A_Part_7 Q32_A_Part_8 Q32_A_Part_9 Q32_A_Part_10 Q32_A_Part_11 Q32_A_Part_12 Q32_A_Part_13 Q32_A_Part_14 Q32_A_Part_15 Q32_A_Part_16 Q32_A_Part_17 Q32_A_Part_18 Q32_A_Part_19 Q32_A_Part_20 Q32_A_OTHER
count 26 17 12 5 15 0 0 17 5 1 1 1 4 2 11 2 0 0 0 17 2
unique 1 1 1 1 1 0 0 1 1 1 1 1 1 1 1 1 0 0 0 1 1
top MySQL PostgreSQL SQLite Oracle Database MongoDB NaN NaN Microsoft SQL Server Microsoft Azure SQL Database Microsoft Azure Cosmos DB Amazon Redshift Amazon Aurora Amazon RDS Amazon DynamoDB Google Cloud BigQuery Google Cloud SQL NaN NaN NaN None Other
freq 26 17 12 5 15 NaN NaN 17 5 1 1 1 4 2 11 2 NaN NaN NaN 17 2
In [40]:
# Count how many times each Q32_A (big data product used on a regular basis)
# option was selected; each option has its own column. The frame and palette
# are consumed by the two-row subplot cell further down.
q32a_labels = {
    'Q32_A_Part_1': 'MySQL',
    'Q32_A_Part_2': 'PostgreSQL',
    'Q32_A_Part_3': 'SQLite',
    'Q32_A_Part_4': 'Oracle Database',
    'Q32_A_Part_5': 'MongoDB',
    'Q32_A_Part_8': 'Microsoft SQL Server',
    'Q32_A_Part_9': 'Microsoft Azure SQL Database',
    'Q32_A_Part_10': 'Microsoft Azure Cosmos DB',
    'Q32_A_Part_11': 'Amazon Redshift',
    'Q32_A_Part_12': 'Amazon Aurora',
    'Q32_A_Part_13': 'Amazon RDS',
    'Q32_A_Part_14': 'Amazon DynamoDB',
    'Q32_A_Part_15': 'Google Cloud BigQuery',
    'Q32_A_Part_16': 'Google Cloud SQL',
    'Q32_A_Part_20': 'None',
    'Q32_A_OTHER': 'Other'
}

data_ukraine_q32A = (
    data_ukraine.iloc[:, 165:186]
    .count()
    .rename_axis('BD')
    .reset_index(name='Count')
    # Keep only products picked at least once, instead of dropping a list of
    # hard-coded row positions.
    .loc[lambda counts: counts['Count'] > 0]
    .assign(
        # Relabel via assignment: an in-place replace on a column selection
        # does not update the frame under pandas Copy-on-Write.
        BD=lambda counts: counts['BD'].replace(q32a_labels),
        # Share of all selections (not of respondents), used as the bar label.
        percent=lambda counts: (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)

colors_32A = ['#1c204b','#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']
In [41]:
# Frequency table of the single-choice Q33 answers (big data product used most often).
# rename_axis + reset_index(name=...) yields the columns ['BD', 'Count'] on
# every pandas version; renaming {'index': ..., 'Q33': ...} after to_frame()
# mislabels the columns on pandas >= 2.0, where value_counts() names its
# result 'count' and its index after the source column.
BD_often = (
    data_ukraine['Q33']
    .value_counts()
    .rename_axis('BD')
    .reset_index(name='Count')
)
# Share of all Q33 answers, formatted for the bar labels.
BD_often['percent'] = ((BD_often['Count'] / BD_often['Count'].sum())*100).round(2).astype(str) + '%'
colors_BD_often = ['#FF7B00','#FF8800','#FF9500','#FFA200', '#FFAA00', '#FFB700','#FFC300', '#FFD000', '#FFDD00', '#FFEA00']
In [42]:
# Two stacked bar charts: products used on a regular basis (top row) versus
# the single product used most often (bottom row).
fig = make_subplots(
    rows=2,
    cols=1,
    subplot_titles=("<b>Regular basis used</b>", "<b>Most often used</b>"),
    shared_yaxes=True,
)

regular_bars = go.Bar(
    x=data_ukraine_q32A['BD'],
    y=data_ukraine_q32A['Count'],
    text=data_ukraine_q32A['percent'],
    marker_color=colors_32A,
)
fig.add_trace(regular_bars, row=1, col=1)

most_often_bars = go.Bar(
    x=BD_often['BD'],
    y=BD_often['Count'],
    text=BD_often['percent'],
    marker_color=colors_BD_often,
)
fig.add_trace(most_often_bars, row=2, col=1)

# update_traces() without a selector styles every trace, so one call after
# both traces are added gives the same figure as styling after each add.
fig.update_traces(
    texttemplate='%{text}',
    textposition='outside',
    cliponaxis=False,
    hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
    textfont_size=17,
)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

fig.update_layout(
    title_text="<b>Big data tools distribution</b>",
    title_x=0.5,
    title_font_size=30,
    font=dict(family="serif", size=17, color='#000000'),
    height=1100,
    showlegend=False,
    plot_bgcolor='white',
    margin=dict(pad=30),
    xaxis_title=None,
    yaxis_title=None,
    xaxis={'showticklabels': True},
    yaxis={'categoryorder': 'total ascending'},
)

fig.show()

What about automated machine learning tools (or partial AutoML tools) on a regular basis

In [43]:
# Summary (count / unique / top / freq) of the Q36_A AutoML tool columns, one column per answer option.
data_ukraine.iloc[:,205:213].describe()
Out[43]:
Q36_A_Part_1 Q36_A_Part_2 Q36_A_Part_3 Q36_A_Part_4 Q36_A_Part_5 Q36_A_Part_6 Q36_A_Part_7 Q36_A_OTHER
count 10 2 2 0 7 8 51 0
unique 1 1 1 0 1 1 1 0
top Automated data augmentation (e.g. imgaug, albumentations) Automated feature engineering/selection (e.g. tpot, boruta_py) Automated model selection (e.g. auto-sklearn, xcessiv) NaN Automated hyperparameter tuning (e.g. hyperopt, ray.tune, Vizier) Automation of full ML pipelines (e.g. Google AutoML, H2O Driverless AI) No / None NaN
freq 10 2 2 NaN 7 8 51 NaN

Sadly, not many professionals from Ukraine use automated machine learning tools.
I used albumentations a few times:)

In [44]:
# Count how many times each Q36_A (automated ML tool category) option was
# selected; each option has its own column.
q36a_labels = {
    'Q36_A_Part_1': 'Automated data augmentation (e.g. imgaug, albumentations)',
    'Q36_A_Part_2': 'Automated feature engineering/selection (e.g. tpot, boruta_py)',
    'Q36_A_Part_3': 'Automated model selection (e.g. auto-sklearn, xcessiv)',
    'Q36_A_Part_5': 'Automated hyperparameter tuning (e.g. hyperopt, ray.tune, Vizier)',
    'Q36_A_Part_6': 'Automation of full ML pipelines (e.g. Google AutoML, H2O Driverless AI)',
    'Q36_A_Part_7': 'No'
}

data_ukraine_q36A = (
    data_ukraine.iloc[:, 205:213]
    .count()
    .rename_axis('AutoML')
    .reset_index(name='Count')
    # Keep only options picked at least once, instead of dropping hard-coded
    # row positions.
    .loc[lambda counts: counts['Count'] > 0]
    .assign(
        # Relabel via assignment: an in-place replace on a column selection
        # does not update the frame under pandas Copy-on-Write.
        AutoML=lambda counts: counts['AutoML'].replace(q36a_labels),
        # Share of all selections (not of respondents), used as the bar label.
        percent=lambda counts: (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)

colors = ['#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae']


# Horizontal bars; the y axis is re-ordered by total in the layout below.
fig = go.Figure(go.Bar(
            x=data_ukraine_q36A['Count'],
            y=data_ukraine_q36A['AutoML'],
            text=data_ukraine_q36A['percent'],
            orientation='h',
            marker_color=colors
                        ))

fig.update_traces(texttemplate='%{text}', 
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)
                  
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='white', 
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Automated machine learning tools</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Tools to help manage machine learning experiments

In [45]:
# Summary (count / unique / top / freq) of the Q38_A experiment-tracking tool columns, one column per answer option.
data_ukraine.iloc[:,221:233].describe()
Out[45]:
Q38_A_Part_1 Q38_A_Part_2 Q38_A_Part_3 Q38_A_Part_4 Q38_A_Part_5 Q38_A_Part_6 Q38_A_Part_7 Q38_A_Part_8 Q38_A_Part_9 Q38_A_Part_10 Q38_A_Part_11 Q38_A_OTHER
count 2 6 0 0 10 0 0 0 0 9 46 3
unique 1 1 0 0 1 0 0 0 0 1 1 1
top Neptune.ai Weights & Biases NaN NaN TensorBoard NaN NaN NaN NaN MLflow No / None Other
freq 2 6 NaN NaN 10 NaN NaN NaN NaN 9 46 3

Yep, TensorBoard indeed has some popularity in Ukraine!

In [46]:
# Count how many times each Q38_A (ML experiment management tool) option was
# selected; each option has its own column.
q38a_labels = {
    'Q38_A_Part_1': 'Neptune.ai',
    'Q38_A_Part_2': 'Weights & Biases',
    'Q38_A_Part_5': 'TensorBoard',
    'Q38_A_Part_10': 'MLflow',
    'Q38_A_Part_11': 'No',
    'Q38_A_OTHER': 'Other'
}

data_ukraine_q38A = (
    data_ukraine.iloc[:, 221:233]
    .count()
    .rename_axis('experiments')
    .reset_index(name='Count')
    # Keep only tools picked at least once, instead of dropping a list of
    # hard-coded row positions.
    .loc[lambda counts: counts['Count'] > 0]
    # Relabel via assignment: an in-place replace on a column selection does
    # not update the frame under pandas Copy-on-Write.
    .assign(experiments=lambda counts: counts['experiments'].replace(q38a_labels))
)

fig = go.Figure(data=[go.Pie(labels=data_ukraine_q38A['experiments'], 
                             values=data_ukraine_q38A['Count'])])

colors = ['#1f2354','#252a64', '#2b3175', '#323885',  '#383f95', '#3b439e', '#3e46a6', '#414aae']

# Slices show their label; percentage and raw value appear on hover.
fig.update_traces(hoverinfo='percent+value', 
                  textinfo='label', 
                  textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=1)))

fig.update_layout(showlegend=False, 
                  title_text="<b>Experiments management tools</b>",
                  title_x=0.55,
                  font=dict(family='serif', size=20, color='#000000'))

fig.show()

Where do Ukrainian data scientists publicly share or deploy their data analysis or machine learning applications?

In [47]:
# Summary (count / unique / top / freq) of the Q39 sharing/deployment platform columns, one column per answer option.
data_ukraine.iloc[:,233:243].describe()
Out[47]:
Q39_Part_1 Q39_Part_2 Q39_Part_3 Q39_Part_4 Q39_Part_5 Q39_Part_6 Q39_Part_7 Q39_Part_8 Q39_Part_9 Q39_OTHER
count 1 2 0 34 2 22 15 0 25 2
unique 1 1 0 1 1 1 1 0 1 1
top Plotly Dash Streamlit NaN GitHub Personal blog Kaggle Colab NaN I do not share my work publicly Other
freq 1 2 NaN 34 2 22 15 NaN 25 2
In [48]:
# Count how many times each Q39 (where work is publicly shared or deployed)
# option was selected; each option has its own column.
q39_labels = {
    'Q39_Part_1': 'Plotly Dash',
    'Q39_Part_2': 'Streamlit',
    'Q39_Part_4': 'GitHub',
    'Q39_Part_5': 'Personal blog',
    'Q39_Part_6': 'Kaggle',
    'Q39_Part_7': 'Colab',
    'Q39_Part_9': 'I do not share my work publicly',
    'Q39_OTHER': 'Other'
}

data_ukraine_q39 = (
    data_ukraine.iloc[:, 233:243]
    .count()
    .rename_axis('deploy')
    .reset_index(name='Count')
    # Keep only platforms picked at least once, instead of dropping
    # hard-coded row positions.
    .loc[lambda counts: counts['Count'] > 0]
    # Relabel via assignment: an in-place replace on a column selection does
    # not update the frame under pandas Copy-on-Write.
    .assign(deploy=lambda counts: counts['deploy'].replace(q39_labels))
)

fig = go.Figure(data=[go.Pie(labels=data_ukraine_q39['deploy'], 
                             values=data_ukraine_q39['Count'])])

colors = ['#1f2354','#252a64', '#2b3175', '#323885',  '#383f95', '#3b439e', '#3e46a6', '#414aae']

# Slices show their label; percentage and raw value appear on hover.
fig.update_traces(hoverinfo='percent+value', 
                  textinfo='label', 
                  textfont_size=20,
                  marker=dict(colors=colors, line=dict(color='white', width=1)))

fig.update_layout(showlegend=False, 
                  title_text="<b>Public deploy platforms</b>",
                  title_x=0.53,
                  font=dict(family='serif', size=20, color='#000000'))

fig.show()

Upgrade your skills

In [49]:
# Summary (count / unique / top / freq) of the Q40 learning platform columns, one column per answer option.
data_ukraine.iloc[:,243:255].describe()
Out[49]:
Q40_Part_1 Q40_Part_2 Q40_Part_3 Q40_Part_4 Q40_Part_5 Q40_Part_6 Q40_Part_7 Q40_Part_8 Q40_Part_9 Q40_Part_10 Q40_Part_11 Q40_OTHER
count 94 23 69 34 9 18 31 14 5 25 10 15
unique 1 1 1 1 1 1 1 1 1 1 1 1
top Coursera edX Kaggle Learn Courses DataCamp Fast.ai Udacity Udemy LinkedIn Learning Cloud-certification programs (direct from AWS, Azure, GCP, or similar) University Courses (resulting in a university degree) None Other
freq 94 23 69 34 9 18 31 14 5 25 10 15
In [50]:
# Count how many times each Q40 (learning platform) option was selected; each
# option has its own column.
q40_labels = {
    'Q40_Part_1': 'Coursera',
    'Q40_Part_2': 'edX',
    'Q40_Part_3': 'Kaggle Learn Courses',
    'Q40_Part_4': 'DataCamp',
    'Q40_Part_5': 'Fast.ai',
    'Q40_Part_6': 'Udacity',
    'Q40_Part_7': 'Udemy',
    'Q40_Part_8': 'LinkedIn Learning',
    'Q40_Part_9': 'Cloud-certification programs',
    'Q40_Part_10': 'University Courses',
    'Q40_Part_11': 'None',
    'Q40_OTHER': 'Other'
}

data_ukraine_q40 = (
    data_ukraine.iloc[:, 243:255]
    .count()
    .rename_axis('courses')
    .reset_index(name='Count')
    .assign(
        # Relabel via assignment: an in-place replace on a column selection
        # does not update the frame under pandas Copy-on-Write.
        courses=lambda counts: counts['courses'].replace(q40_labels),
        # Vectorised share of all selections; no hard-coded row count needed.
        percent=lambda counts: (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)


colors = ['#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

# Vertical bars, already ordered from most to least selected platform.
fig = go.Figure(
   go.Bar(
            x=data_ukraine_q40['courses'],
            y=data_ukraine_q40['Count'],
            text=data_ukraine_q40['percent'],
            marker_color=colors))

fig.update_traces(texttemplate='%{text}', 
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)


fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='white', 
                  margin=dict(pad=40),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Learning platforms</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

I looooove learning! I tried almost all these platforms and they are really great! Never stop exploring something new on your way :)

Popular media sources that report on data science topics

In [51]:
# Summary (count / unique / top / freq) of the Q42 media source columns, one column per answer option.
data_ukraine.iloc[:,256:268].describe()
Out[51]:
Q42_Part_1 Q42_Part_2 Q42_Part_3 Q42_Part_4 Q42_Part_5 Q42_Part_6 Q42_Part_7 Q42_Part_8 Q42_Part_9 Q42_Part_10 Q42_Part_11 Q42_OTHER
count 11 15 17 92 20 77 14 59 19 27 4 8
unique 1 1 1 1 1 1 1 1 1 1 1 1
top Twitter (data science influencers) Email newsletters (Data Elixir, O'Reilly Data & AI, etc) Reddit (r/machinelearning, etc) Kaggle (notebooks, forums, etc) Course Forums (forums.fast.ai, Coursera forums, etc) YouTube (Kaggle YouTube, Cloud AI Adventures, etc) Podcasts (Chai Time Data Science, O’Reilly Data Show, etc) Blogs (Towards Data Science, Analytics Vidhya, etc) Journal Publications (peer-reviewed journals, conference proceedings, etc) Slack Communities (ods.ai, kagglenoobs, etc) None Other
freq 11 15 17 92 20 77 14 59 19 27 4 8
In [52]:
# Count how many times each Q42 (media source reporting on data science)
# option was selected; each option has its own column.
q42_labels = {
    'Q42_Part_1': 'Twitter (data science influencers)',
    # Apostrophe restored so the label matches the survey's answer text.
    'Q42_Part_2': "Email newsletters (Data Elixir, O'Reilly Data & AI, etc)",
    'Q42_Part_3': 'Reddit (r/machinelearning, etc)',
    'Q42_Part_4': 'Kaggle (notebooks, forums, etc)',
    'Q42_Part_5': 'Course Forums (forums.fast.ai, Coursera forums, etc)',
    'Q42_Part_6': 'YouTube (Kaggle YouTube, Cloud AI Adventures, etc)',
    'Q42_Part_7': 'Podcasts (Chai Time Data Science, O’Reilly Data Show, etc)',
    'Q42_Part_8': 'Blogs (Towards Data Science, Analytics Vidhya, etc)',
    'Q42_Part_9': 'Journal Publications (peer-reviewed journals, conference proceedings, etc)',
    'Q42_Part_10': 'Slack Communities (ods.ai, kagglenoobs, etc)',
    'Q42_Part_11': 'None',
    'Q42_OTHER': 'Other'
}

data_ukraine_q42 = (
    data_ukraine.iloc[:, 256:268]
    .count()
    .rename_axis('media')
    .reset_index(name='Count')
    .assign(
        # Relabel via assignment: an in-place replace on a column selection
        # does not update the frame under pandas Copy-on-Write.
        media=lambda counts: counts['media'].replace(q42_labels),
        # Vectorised share of all selections; no hard-coded row count needed.
        percent=lambda counts: (counts['Count'] / counts['Count'].sum() * 100).round(2).astype(str) + '%',
    )
    .sort_values('Count', ascending=False)
)


colors = ['#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']


# Horizontal bars; the y axis is re-ordered by total in the layout below.
fig = go.Figure(go.Bar(
            x=data_ukraine_q42['Count'],
            y=data_ukraine_q42['media'],
            text=data_ukraine_q42['percent'],
            orientation='h',
            marker_color=colors
                        ))

fig.update_traces(texttemplate='%{text}', 
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)
                  
fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='white', 
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Media platforms</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

My personal choices are YouTube and blogs!

In what industry is your current employer/contract?

What is the size of the company where you are employed?

How many individuals are responsible for data science workloads at your place of business?

In [53]:
# Label missing answers for Q20 (industry), Q21 (company size) and Q22 (data
# science team size) so they show up as their own category in the charts below.
# Assign the filled columns back: calling fillna(inplace=True) on a column
# selection does not update the parent frame under pandas Copy-on-Write.
employer_columns = ["Q20", "Q21", "Q22"]
data_ukraine[employer_columns] = data_ukraine[employer_columns].fillna("No answer")

Time to explore industry!

In [54]:
# Frequency table of the single-choice Q20 answers (employer industry).
# rename_axis + reset_index(name=...) yields the columns ['industry', 'Count']
# on every pandas version; renaming {'index': ..., 'Q20': ...} after
# to_frame() mislabels the columns on pandas >= 2.0, where value_counts()
# names its result 'count' and its index after the source column.
data_ukraine_q20 = (
    data_ukraine['Q20']
    .value_counts()
    .rename_axis('industry')
    .reset_index(name='Count')
)

# Share of all Q20 answers, formatted for the bar labels.
data_ukraine_q20['percent'] = ((data_ukraine_q20['Count'] / data_ukraine_q20['Count'].sum())*100).round(2).astype(str) + '%'


colors = ['#1f2354','#22275c','#252a64', '#282e6c',  '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

# Vertical bars in value_counts() order, i.e. most frequent industry first.
fig = go.Figure(
   go.Bar(
            x=data_ukraine_q20['industry'],
            y=data_ukraine_q20['Count'],
            text=data_ukraine_q20['percent'],
            marker_color=colors))

fig.update_traces(texttemplate='%{text}', 
                  textposition='outside',
                  cliponaxis = False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)


fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)
 
fig.update_layout(showlegend=False, 
                  plot_bgcolor='white', 
                  margin=dict(pad=40),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Industry allocation</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Yet another clickable sunburst plot! Feel free to investigate how the size of the Data Science team depends on company size and industry :)

In [55]:
# Interactive sunburst drilled down as Q20 (industry) -> Q21 (company size) ->
# Q22 (data science team size), coloured by the Q20 value.
fig = px.sunburst(
    data_ukraine,
    path=['Q20', 'Q21', 'Q22'],
    color='Q20',
)

# Title and typography first, then the generic layout switches shared with the
# other figures in this notebook.
fig.update_layout(
    title_text="<b>Relation between industry, size of the company and DS team size</b>",
    title_x=0.5,
    title_font_size=30,
    font=dict(family="serif", size=17, color='#000000'),
    showlegend=False,
    plot_bgcolor='white',
    margin=dict(pad=20),
    xaxis_title=None,
    yaxis_title=None,
    xaxis={'showticklabels': True},
    yaxis={'categoryorder': 'total ascending'},
)
fig.show()

Do employers incorporate machine learning methods into their business?

As can be seen, most companies do not use ML in production, but they are putting effort into investigating this exciting new area!

In [56]:
# Count the answers to Q23 (whether the employer incorporates ML methods);
# value_counts() skips missing answers.
# The index and the counts are named explicitly instead of renaming the
# 'index' / 'Q23' columns afterwards: pandas >= 2.0 changed the labels that
# value_counts() produces, so the old rename no longer yields the
# 'business' / 'Count' columns used below.
business = (
    data_ukraine['Q23']
    .value_counts()
    .rename_axis('business')
    .reset_index(name='Count')
)

# Share of each answer among the counted answers, formatted like '12.34%'.
business['percent'] = (
    (business['Count'] / business['Count'].sum()) * 100
).round(2).astype(str) + '%'

# Palette passed to the bars as per-bar colours (dark to light).
colors = ['#1f2354', '#282e6c', '#2b3175', '#353c8d', '#383f95', '#3e46a6',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

# Horizontal bar chart: one bar per answer, labelled with its percentage.
fig = go.Figure(go.Bar(
    x=business['Count'],
    y=business['business'],
    text=business['percent'],
    orientation='h',
    marker_color=colors
))

# Print the percentage next to each bar; the hover box shows only the raw count.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis=False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# The answers sit on the y axis, ordered by their bar totals; the count axis
# tick labels are hidden because every bar already carries its percentage.
fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Employer incorporate machine learning methods</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Activities that make up an important part of your role at work

In [57]:
# Per-column summary (count / unique / top / freq) of the columns at positions 119-126.
data_ukraine.iloc[:, slice(119, 127)].describe()
  
Out[57]:
Q24_Part_1 Q24_Part_2 Q24_Part_3 Q24_Part_4 Q24_Part_5 Q24_Part_6 Q24_Part_7 Q24_OTHER
count 61 33 41 35 41 31 30 6
unique 1 1 1 1 1 1 1 1
top Analyze and understand data to influence product or business decisions Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data Build prototypes to explore applying machine learning to new areas Build and/or run a machine learning service that operationally improves my product or workflows Experimentation and iteration to improve existing ML models Do research that advances the state of the art of machine learning None of these activities are an important part of my role at work Other
freq 61 33 41 35 41 31 30 6

My favorite part of my work is building models and applying them to the data. You can see from the chart below that this is indeed an important part of work for a lot of other people!

In [58]:
# Number of non-missing answers in each of the columns at positions 119-126
# (the Q24_Part_1 ... Q24_OTHER columns whose labels are mapped below).
# DataFrame.count() returns these counts as integers, so the percentages can
# be computed with ordinary vectorised arithmetic instead of a per-element
# loop hard-coded to eight rows.
data_ukraine_q24 = (
    data_ukraine.iloc[:, 119:127]
    .count()
    .rename_axis('work')
    .reset_index(name='Count')
)

# Replace the column codes with the activity they stand for. The result is
# assigned back to the column: calling replace(..., inplace=True) on a column
# selection is a chained in-place update, which warns on recent pandas and
# leaves the frame unchanged under Copy-on-Write.
data_ukraine_q24['work'] = data_ukraine_q24['work'].replace({
    'Q24_Part_1': 'Analyze and understand data to influence product or business decisions',
    'Q24_Part_2': 'Build and/or run the data infrastructure that my business uses for storing, analyzing, and operationalizing data',
    'Q24_Part_3': 'Build prototypes to explore applying machine learning to new areas',
    'Q24_Part_4': 'Build and/or run a machine learning service that operationally improves my product or workflows',
    'Q24_Part_5': 'Experimentation and iteration to improve existing ML models',
    'Q24_Part_6': 'Do research that advances the state of the art of machine learning',
    'Q24_Part_7': 'None of these activities are an important part of my role at work',
    'Q24_OTHER': 'Other'
})

# Share of each activity among all counted selections (the denominator is the
# sum of the per-column counts), formatted like '12.34%'.
data_ukraine_q24['percent'] = (
    (data_ukraine_q24['Count'] / data_ukraine_q24['Count'].sum()) * 100
).round(2).astype(str) + '%'
data_ukraine_q24 = data_ukraine_q24.sort_values('Count', ascending=False)

# Palette passed to the bars as per-bar colours (dark to light).
colors = ['#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2f357d', '#323885', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

# Horizontal bar chart: one bar per activity, labelled with its percentage.
fig = go.Figure(go.Bar(
    x=data_ukraine_q24['Count'],
    y=data_ukraine_q24['work'],
    text=data_ukraine_q24['percent'],
    orientation='h',
    marker_color=colors
))

# Print the percentage next to each bar; the hover box shows only the raw count.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis=False,
                  hovertemplate='<b>Count</b>: %{x}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# The activities sit on the y axis, ordered by their bar totals; the count axis
# tick labels are hidden because every bar already carries its percentage.
fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=20),
                  xaxis={'showticklabels': False},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Important parts of work</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Current yearly compensation (approximate $USD)

In [59]:
# Count the answers to Q25 (yearly compensation bracket); value_counts() skips
# missing answers and returns the brackets in descending order of frequency.
# The index and the counts are named explicitly instead of renaming the
# 'index' / 'Q25' columns afterwards: pandas >= 2.0 changed the labels that
# value_counts() produces, so the old rename no longer yields the
# 'compensation' / 'Count' columns used below.
data_ukraine_q25 = (
    data_ukraine['Q25']
    .value_counts()
    .rename_axis('compensation')
    .reset_index(name='Count')
)

# Share of each bracket among the counted answers, formatted like '12.34%'.
data_ukraine_q25['percent'] = (
    (data_ukraine_q25['Count'] / data_ukraine_q25['Count'].sum()) * 100
).round(2).astype(str) + '%'

# Palette passed to the bars as per-bar colours (dark to light).
colors = ['#1f2354','#22275c','#252a64', '#282e6c', '#2b3175','#2b3175', '#2f357d', '#2f357d', '#323885', '#353c8d', '#353c8d', '#383f95', '#3b439e', '#3e46a6', '#414aae',
          '#2c7da0','#468faf', '#61a5c2', '#89c2d9', '#a9d6e5']

# Vertical bar chart: one bar per compensation bracket, labelled with its percentage.
fig = go.Figure(
    go.Bar(
        x=data_ukraine_q25['compensation'],
        y=data_ukraine_q25['Count'],
        text=data_ukraine_q25['percent'],
        marker_color=colors))

# Print the percentage above each bar; the hover box shows only the raw count.
fig.update_traces(texttemplate='%{text}',
                  textposition='outside',
                  cliponaxis=False,
                  hovertemplate='<b>Count</b>: %{y}<extra></extra> ',
                  textfont_size=17)

fig.update_xaxes(showgrid=False)
fig.update_yaxes(showgrid=False)

# NOTE(review): 'categoryorder' is set on the y axis, which carries the numeric
# counts in this vertical chart, so it presumably has no visible effect; the
# bars appear in the frequency order produced by value_counts(), not in order
# of compensation. Confirm that this is the intended ordering.
fig.update_layout(showlegend=False,
                  plot_bgcolor='white',
                  margin=dict(pad=40),
                  xaxis={'showticklabels': True},
                  yaxis_title=None,
                  xaxis_title=None,
                  yaxis={'categoryorder':'total ascending'},
                  title_text="<b>Yearly compensation</b>",
                  title_x=0.5,
                  font=dict(family="serif", size=17, color='#000000'),
                  title_font_size=30)
fig.show()

Conclusions

I really appreciate that you have read and explored all this stuff! Thank you! I hope you enjoyed it :)

In conclusion, I want to say that the Data Science and Machine Learning industry is pretty young in Ukraine, but it is growing extremely fast! I hope that, in the near future, our community will have even more talented people who will push data science to the stars!